library(tidyverse)
library(caret)
library(e1071)
library(factoextra)
library(plotly)
library(ggcorrplot)
library(fields)
brain_data <- read_csv("../data/IBIS_brain_data_ex.csv")
brain_data <- brain_data %>%
select(names(brain_data)[grepl("V24|RiskGroup|CandID", names(brain_data))]) %>%
select(CandID:Uncinate_R_V24, RiskGroup:V24_VDQ) %>%
drop_na()
brain_data_x <- brain_data %>% select(EACSF_V24:Uncinate_R_V24)
pca_brain <- princomp(x=brain_data_x, cor=TRUE,scores=TRUE)
# Look at screen plot to view amount of variance explained by PC
fviz_eig(pca_brain)
# Just look at first 2 PCs
load_subset <- data.frame(pca_brain$loadings[,1:2]) %>%
rownames_to_column(var = "variable") %>%
pivot_longer(cols=c("Comp.1", "Comp.2"), names_to="comp_number", values_to="loading")
# Plot loadings
ggplot(data=load_subset, mapping=aes(x=variable, y=loading))+
geom_bar(stat="identity")+
facet_grid(comp_number~.)+
theme_bw()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# Hard to interpret, need to reduce size
load_subset_normed <- data.frame(pca_brain$loadings[,1:2]) %>%
rownames_to_column(var = "variable") %>%
mutate(Comp.1 = scale(Comp.1),
Comp.2 = scale(Comp.2)) %>%
pivot_longer(cols=c("Comp.1", "Comp.2"), names_to="comp_number", values_to="loading")
ggplot(data=load_subset_normed, mapping=aes(x=loading))+
geom_histogram()+
facet_grid(comp_number~.)+
theme_bw()
load_subset_normed <- load_subset_normed %>%
filter(loading>0.5|loading< -0.5) %>%
mutate(large_loading = factor(ifelse(loading>1.96|loading< -1.96, 1, 0)))
ggplot(data=load_subset_normed, mapping=aes(x=variable, y=loading, fill=large_loading))+
geom_bar(stat="identity")+
facet_grid(comp_number~.)+
theme_bw()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# Can look at PCA scores for each person and plot as well
fviz_pca_ind(pca_brain,
col.ind = "cos2", # Color by the quality of representation
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping
)
# Do these relate to diagnosis?
fviz_pca_ind(pca_brain,
col.ind = brain_data$RiskGroup, # color by groups
palette = c("#0072B2", "#D55E00", "#CC79A7"),
addEllipses = TRUE, # Concentration ellipses
ellipse.type = "confidence",
legend.title = "Groups",
label = "none",
repel = TRUE
)
# Can look at high variables contribute to components, labels make this messy
fviz_pca_var(pca_brain,
col.var = "contrib", # Color by contributions to the PC
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping
)
fviz_pca_var(pca_brain,
col.var = "contrib", # Color by contributions to the PC
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
select.var = list("cos2"=10),
repel = TRUE # Avoid text overlapping
)
# Can extract scores, use for further analysis
pc_scores <- pca_brain$scores[,1:2]
brain_data_w_pcs <- data.frame(brain_data, pc_scores)
# Create train and testing sets
brain_data_w_pcs <- brain_data_w_pcs %>%
filter(grepl("HR", RiskGroup)) %>%
mutate(RiskGroup=fct_relevel(factor(RiskGroup), "HR-Neg"))
tt_indices <- createDataPartition(y=brain_data_w_pcs$RiskGroup, p=0.6, list=FALSE)
# Train using PCs with logistic regression
train_data <- brain_data_w_pcs[tt_indices,]
test_data <- brain_data_w_pcs[-tt_indices,]
model_fit <- glm(RiskGroup~`Comp.1`+`Comp.1`, data=train_data, family=binomial)
test_data$model_prob <- predict(model_fit, type="response", newdata = test_data)
test_data$model_pred <-
fct_relevel(factor(ifelse(test_data$model_prob>0.25, "HR-ASD", "HR-Neg")), "HR-Neg")
# Plot PCs by probability, make plot interactive
ggplotly(ggplot(test_data, mapping=aes(x=`Comp.1`, y=`Comp.2`,
color=model_prob, label=RiskGroup))+
geom_point()+
scale_color_gradient2(low="black", mid = "red", high = "yellow",
midpoint = median(test_data$model_prob))+
theme_bw())
# Histo of PCs by diagnosis
ggplotly(ggplot(test_data, mapping=aes(x=`Comp.1`, fill=RiskGroup))+
geom_histogram())
# Plot confusion matrix as heatmap
test_confusion_matrix <- confusionMatrix(reference=test_data$RiskGroup,
data=test_data$model_pred)$table
# Convert to dataframe
confusion_matrix_df <- data.frame("obs"=c("HR-Neg", "HR-Neg", "HR-ASD", "HR-ASD"),
"pred"=c("HR-Neg", "HR-ASD", "HR-Neg", "HR-ASD"),
"count"=as.vector(confusionMatrix(reference=test_data$RiskGroup,
data=test_data$model_pred)$table))
ggplot(data=confusion_matrix_df, mapping=aes(x=obs, y=pred, fill=count))+
geom_raster()+
geom_text(mapping=aes(label=count), color="white", size=10)+
scale_fill_gradient(low="black", high="red")+
#coord_flip()+
theme_minimal()
# Heatmap with PCs